library(data.table)
library(tidyr)
library(maps)
library(haven)
library(ggplot2)
library(dplyr)
library(readxl)
library(ggrepel)
library(wordcloud)
library(lme4)
library(lmerTest)
# PREPARATION DATASETS FOR ANALYSIS #################### #################### ####################
# PREP THE DATASET FOR ANALYSIS WVS 5 & 6 ####################
# Read the data (Wave 5) ----
# Data of Wave 5 (World Values Survey), distributed as an .rds file
WV5_data <- readRDS("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/F00007944-WV5_Data_R_v20180912.rds")
# Convert the WV5_data object into a plain data frame
WV5_data_df <- as.data.frame(WV5_data)
# Show the first rows (head() instead of dumping the whole frame,
# which is what the original "show first five columns" comment intended)
head(WV5_data_df)
# Rename the survey items to descriptive variable names
WV5_data <- WV5_data_df %>%
  rename(gender = V235, age = V237, country_code = V2, wave = V1,
         risktaking = V86, children = V56, married = V55,
         employed = V241, education = V238)
head(WV5_data)
colnames(WV5_data)
# Pasted console output of colnames(WV5_data) — commented out so the script parses:
# [1] "wave" "V1A" "V1B" "country_code" "V2A" "V3" "V4"
# [8] "V4_CO" "V5" "V5_CO" "V6" "V6_CO" "V7" "V7_CO"
# [15] "V8" "V8_CO" "V9" "V9_CO" "V10" "V11" "V12"
# [22] "V13" "V14" "V15" "V16" "V17" "V18" "V19"
# [29] "V20" "V21" "V22" "V23" "V24" "V25" "V26"
# [36] "V27" "V28" "V29" "V30" "V31" "V32" "V33"
# [43] "V34" "V35" "V36" "V37" "V38" "V39" "V40"
# [50] "V41" "V42" "V43" "V43_01" "V43_02" "V43_03" "V43_04"
# [57] "V43_05" "V43_06" "V43_07" "V43_08" "V43_09" "V43_10" "V43_11"
# [64] "V43_12" "V43_13" "V43_14" "V43_15" "V43_16" "V43_17" "V43_18"
# [71] "V43_19" "V43_20" "V43_21" "V43_22" "V43_23" "V43_24" "V43_25"
# [78] "V43_26" "V43_27" "V43_28" "V43_29" "V43_30" "V44" "V45"
# [85] "V46" "V47" "V48" "V49" "V50" "V51" "V52"
# [92] "V53" "V54" "married" "children" "V57" "V58" "V59"
# [99] "V60" "V61" "V62" "V63" "V64" "V65" "V66"
# [106] "V67" "V68" "V69" "V69_HK" "V70" "V70_HK" "V71"
# [113] "V72" "V73" "V73_HK" "V74" "V74_HK" "V75" "V76"
# [120] "V77" "V78" "V79" "V80" "V81" "V82" "V83"
# [127] "V84" "V85" "risktaking" "V87" "V88" "V89" "V90"
# [134] "V91" "V92" "V93" "V94" "V95" "V96" "V97"
# [141] "V98" "V99" "V100" "V101" "V102" "V103" "V104"
# [148] "V105" "V106" "V107" "V108" "V109" "V110" "V111"
# [155] "V112" "V113" "V114" "V115" "V116" "V117" "V118"
# [162] "V119" "V120" "V121" "V122" "V123" "V124" "V125"
# [169] "V126" "V127" "V128" "V129" "V130" "V130_CA_1" "V130_IQ_1"
# [176] "V130_IQ_2" "V130_IQ_3" "V130_IQ_4" "V130_NZ_1" "V130_NZ_2" "V131" "V132"
# [183] "V133" "V134" "V135" "V136" "V137" "V138" "V139"
# [190] "V140" "V141" "V142" "V143" "V144" "V145" "V146_00"
# [197] "V146_01" "V146_02" "V146_03" "V146_04" "V146_05" "V146_06" "V146_07"
# [204] "V146_08" "V146_09" "V146_10" "V146_11" "V146_12" "V146_13" "V146_14"
# [211] "V146_15" "V146_16" "V146_17" "V146_18" "V146_19" "V146_20" "V146_21"
# [218] "V146_22" "V147" "V148" "V149" "V150" "V151" "V151_IQ_A"
# [225] "V151_IQ_B" "V152" "V153" "V154" "V155" "V156" "V157"
# [232] "V158" "V159" "V160" "V161" "V162" "V163" "V164"
# [239] "V165" "V166" "V167" "V168" "V169" "V170" "V171"
# [246] "V172" "V173" "V174" "V175" "V176" "V177" "V178"
# [253] "V179" "V180" "V181" "V182" "V183" "V184" "V185"
# [260] "V186" "V187" "V188" "V189" "V190" "V191" "V192"
# [267] "V193" "V194" "V195" "V196" "V197" "V198" "V199"
# [274] "V200" "V201" "V202" "V203" "V204" "V205" "V206"
# [281] "V207" "V208" "V209" "V210" "V211" "V212" "V213A"
# [288] "V213B" "V213C" "V213D" "V213E" "V213F" "V213G" "V213H"
# [295] "V213K" "V213L" "V213M" "V213N" "V214" "V215" "V216"
# [302] "V217" "V218" "V219" "V220" "V221" "V222" "V223"
# [309] "V224" "V225" "V226" "V227" "V228" "V229" "V230"
# [316] "V231" "V232" "V233" "V233A" "V234" "gender" "V236"
# [323] "age" "education" "V238CS" "V239" "V240" "employed" "V242"
# [330] "V242A_CO" "V243" "V244" "V245" "V246" "V247" "V248"
# [337] "V249" "V250" "V251" "V252" "V252B" "V253" "V253CS"
# [344] "V254" "V255" "V255CS" "V256" "V257" "V257B" "V257C"
# [351] "V258" "V259" "V259A" "V260" "V261" "V262" "V263"
# [358] "V264" "V265" "S024" "S025" "Y001" "Y002" "Y003"
# [365] "SACSECVAL" "SECVALWGT" "RESEMAVAL" "WEIGHTB" "I_AUTHORITY" "I_NATIONALISM" "I_DEVOUT"
# [372] "DEFIANCE" "WEIGHT1A" "I_RELIGIMP" "I_RELIGBEL" "I_RELIGPRAC" "DISBELIEF" "WEIGHT2A"
# [379] "I_NORM1" "I_NORM2" "I_NORM3" "RELATIVISM" "WEIGHT3A" "I_TRUSTARMY" "I_TRUSTPOLICE"
# [386] "I_TRUSTCOURTS" "SCEPTICISM" "WEIGHT4A" "I_INDEP" "I_IMAGIN" "I_NONOBED" "AUTONOMY"
# [393] "WEIGHT1B" "I_WOMJOB" "I_WOMPOL" "I_WOMEDU" "EQUALITY" "WEIGHT2B" "I_HOMOLIB"
# [400] "I_ABORTLIB" "I_DIVORLIB" "CHOICE" "WEIGHT3B" "I_VOICE1" "I_VOICE2" "I_VOI2_00"
# [407] "VOICE" "WEIGHT4B" "S001" "S007" "S018" "S019" "S021"
# [414] "COW"
# Keep only the variables needed for the analysis
WV5_data <- dplyr::select(WV5_data, gender, age, country_code, wave,
                          risktaking, children, employed, education, married)
WV5_data
# Country-code -> country-name lookup table
countrynames <- read.csv(
  "/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt",
  header = FALSE, as.is = TRUE
)
colnames(countrynames) <- c("code", "name")
# Map each respondent's numeric country code to a readable country name
WV5_data$country <- countrynames$name[match(WV5_data$country_code, countrynames$code)]
# Frequency of each country in the new column
table(WV5_data$country)
# Pasted console output of table(WV5_data$country) — commented out so the script parses:
# Andorra Argentina Australia Brazil Bulgaria Burkina Faso
# 1003 1002 1421 1500 1001 1534
# Canada Chile China Colombia Cyprus (G) Egypt
# 2164 1000 1991 3025 1050 3051
# Ethiopia Finland France Georgia Germany Ghana
# 1500 1014 1001 1500 2064 1534
# Great Britain Guatemala Hong Kong Hungary India Indonesia
# 1041 1000 1252 1007 2001 2015
# Iran Iraq Italy Japan Jordan Malaysia
# 2667 2701 1012 1096 1200 1201
# Mali Mexico Moldova Morocco Netherlands New Zealand
# 1534 1560 1046 1200 1050 954
# Norway Peru Poland Romania Russia Rwanda
# 1025 1500 1000 1776 2033 1507
# Slovenia South Africa South Korea Spain Sweden Switzerland
# 1037 2988 1200 1200 1003 1241
# Taiwan Thailand Trinidad and Tobago Turkey Ukraine United States
# 1227 1534 1002 1346 1000 1249
# Uruguay Viet Nam Zambia
# 1000 1495 1500
# Display the first rows of the updated WV5_data
print(head(WV5_data))
# Read dataset (Wave 6) ----
# load() restores the object under its saved name (WV6_Data_R_v20201117)
# and returns only that name as a character string, so don't assign its result.
load("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/WV6_Data_R_v20201117.rdata")
WV6_data <- WV6_Data_R_v20201117
print(head(WV6_data))
# Rename the survey items to descriptive variable names
WV6_data <- WV6_data %>%
  rename(wave = V1, gender = V240, age = V242, country_code = V2,
         risktaking = V76, children = V58, married = V57,
         employed = V229, education = V248)
# Keep only the variables of interest
WV6_data <- WV6_data %>%
  dplyr::select(gender, age, country_code, wave, risktaking,
                children, employed, education, married)
head(WV6_data)
# Decode the country codes for Wave 6 using the same lookup table
countrynames <- read.csv(
  "/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt",
  header = FALSE, as.is = TRUE
)
colnames(countrynames) <- c("code", "name")
WV6_data$country <- countrynames$name[match(WV6_data$country_code, countrynames$code)]
table(WV6_data$country)
# Pasted console output of table(WV6_data$country) — commented out so the script parses:
# Algeria Argentina Armenia Australia Azerbaijan Belarus
# 1200 1030 1100 1477 1002 1535
# Brazil Chile China Colombia Cyprus (G) Ecuador
# 1486 1000 2300 1512 1000 1202
# Egypt Estonia Georgia Germany Ghana Haiti
# 1523 1533 1202 2046 1552 1996
# Hong Kong India Iraq Japan Jordan Kazakhstan
# 1000 4078 1200 2443 1200 1500
# Kuwait Kyrgyzstan Lebanon Libya Malaysia Mexico
# 1303 1500 1200 2131 1300 2000
# Morocco Netherlands New Zealand Nigeria Pakistan Palestine
# 1200 1902 841 1759 1200 1000
# Peru Philippines Poland Qatar Romania Russia
# 1210 1200 966 1060 1503 2500
# Rwanda Singapore Slovenia South Africa South Korea Spain
# 1527 1972 1069 3531 1200 1189
# Sweden Taiwan Thailand Trinidad and Tobago Tunisia Turkey
# 1206 1238 1200 999 1205 1605
# Ukraine United States Uruguay Uzbekistan Yemen Zimbabwe
# 1500 2232 1000 1500 1000 1500
head(WV6_data)
# Combine the two datasets (Wave 5 + Wave 6) ----
head(WV5_data)
head(WV6_data)
WVS_data <- rbind(WV5_data, WV6_data)
head(WVS_data)
# Exclude participants with WVS missing-value codes (negative values);
# children == 0 ("no children") is a valid answer and is kept
WVS_data <- subset(WVS_data, risktaking > 0 & gender > 0 & age > 0 &
                     education > 0 & employed > 0 & married > 0 & children >= 0)
data_Wave5 <- subset(WV5_data, risktaking > 0 & gender > 0 & age > 0 &
                       education > 0 & employed > 0 & married > 0 & children >= 0)
data_Wave6 <- subset(WV6_data, risktaking > 0 & gender > 0 & age > 0 &
                       education > 0 & employed > 0 & married > 0 & children >= 0)
WVS_data <- na.omit(WVS_data)
data_Wave5 <- na.omit(data_Wave5)
data_Wave6 <- na.omit(data_Wave6)
# Harmonise the country label used by the two waves
WVS_data <- WVS_data %>%
  mutate(country = ifelse(country == "Great Britain", "United Kingdom", country))
# Transformation of the risk-taking item ----
# Reverse-code the item (6 - x + 1 = 7 - x) so that high values = more risk taking
WVS_data$risktaking <- 7 - WVS_data$risktaking
# T-score (mean = 50, sd = 10); as.numeric() avoids the one-column matrix
# that scale() would otherwise store in the data frame
WVS_data$T_score_risktaking <- as.numeric(10 * scale(WVS_data$risktaking, center = TRUE, scale = TRUE) + 50)
head(WVS_data)
# Z-score derived back from the T-score (T has mean 50, sd 10)
WVS_data$Z_score_risktaking <- (WVS_data$T_score_risktaking - 50) / 10
print(head(WVS_data))
# Within-country age z-scores; ungroup() so the grouping does not leak downstream
WVS_data <- WVS_data %>%
  group_by(country) %>%
  mutate(z_score_age = as.numeric(scale(age))) %>%
  ungroup()
head(WVS_data)
# Recode covariates into binary indicators
WVS_data$gender <- ifelse(WVS_data$gender == 1, 0, 1)      # sex: 0 = male, 1 = female
WVS_data$children <- ifelse(WVS_data$children == 0, 0, 1)  # children: no vs. yes
WVS_data$married <- ifelse(WVS_data$married == 1, 1, 0)    # married: yes vs. no
WVS_data$employed <- ifelse(WVS_data$employed < 4, 1, 0)   # employed: yes vs. no
WVS_data$education <- ifelse(WVS_data$education < 4, 0, 1) # education: no primary vs. primary+
head(WVS_data)
# PREP THE DATASET FOR ANALYSIS GPS ####################
# Add GPS data (Global Preferences Survey, individual level)
gps_data <- haven::read_dta("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/individual_new.dta")
head(gps_data)
# Drop rows with missing values on the key variables
gps_data <- gps_data %>%
  drop_na(country, isocode, risktaking, gender, age)
# Display the cleaned data
head(gps_data)
# Keep only the variables of interest
gps_data <- gps_data %>%
  dplyr::select(country, isocode, ison, risktaking, gender, age)
head(gps_data)
# Within-country age z-scores; as.numeric() keeps the column a plain vector
# (scale() returns a one-column matrix) and ungroup() removes the grouping
gps_data <- gps_data %>%
  group_by(country) %>%
  mutate(z_score_age = as.numeric(scale(age))) %>%
  ungroup()
# Display the new column with z-scores per country
head(gps_data)
# PREP THE DATASET FOR ANALYSIS HARDSHIP ####################
excel_path <- "/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/Hardship/Hardship_complete_2024.xlsx"
hardship <- read_excel(excel_path)
# Intended labels for the hardship indicator columns
# (kept for reference; never applied to the data frame in the original script)
labels <- c("country", "mean_homicide", "gdp", "gini_income",
            "Infant_mortality", "life_expect", "primary_female_enrollment_rate")
print(hardship)
# Log-transform the skewed indicators
hardship$mean_homicide <- log(hardship$mean_homicide)
hardship$gdp <- log(hardship$gdp)
hardship$Infant_mortality <- log(hardship$Infant_mortality)
hardship$life_expect <- log(hardship$life_expect)
hardship$gini_income <- log(hardship$gini_income)
hardship$primary_female_enrollment_rate <- log(hardship$primary_female_enrollment_rate)
# z-standardize all indicators; gdp and life expectancy are reverse-coded
# (negated) so that higher values consistently mean MORE hardship
hardship$mean_homicide <- scale(hardship$mean_homicide)
hardship$gdp <- scale(-hardship$gdp)
hardship$Infant_mortality <- scale(hardship$Infant_mortality)
hardship$life_expect <- scale(-hardship$life_expect)
hardship$gini_income <- scale(hardship$gini_income)
hardship$primary_female_enrollment_rate <- scale(hardship$primary_female_enrollment_rate)
hardship
# Composite hardship index = mean of the six standardized indicators.
# (The original first computed 'hardship' from the RAW variables and then
# overwrote it here; that dead intermediate step has been removed.)
hardship$hardship <- (hardship$mean_homicide + hardship$gdp + hardship$gini_income +
                        hardship$life_expect + hardship$Infant_mortality +
                        hardship$primary_female_enrollment_rate) / 6
hardship
# SUP MATERIALS: correlation between hardship indicators
# (a stray "x" before this comment in the original was a runtime error)
indicator_cols <- c("mean_homicide", "gdp", "gini_income",
                    "Infant_mortality", "life_expect",
                    "primary_female_enrollment_rate")
# Correlation matrix of the six (transformed) hardship indicators
correlation_hardship <- cor(hardship[, indicator_cols])
# Visualise the correlation matrix as a heatmap
library(ggplot2)
library(reshape2)
correlation_hardship_melted <- melt(correlation_hardship)
ggplot(correlation_hardship_melted, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 10, hjust = 1)) +
  coord_fixed()
# Correlation matrix as a plain table (reuses the matrix computed above;
# the original recomputed the identical cor() call here)
correlation_table <- as.data.frame(correlation_hardship)
print(correlation_table)
# Keep only the columns needed downstream
hardship <- hardship %>%
  dplyr::select(country, isocode, hardship)
hardship
# PREP THE DATASET FOR ANALYSIS MIXED-MODELS ####################
# Add country-level hardship to the individual-level WVS data
head(WVS_data)
WVS_mixed_model <- left_join(WVS_data, hardship, by = "country")
head(WVS_mixed_model)
# Add country-level hardship to the individual-level GPS data
# (head() instead of printing the entire multi-thousand-row frames)
gps_mixed_model <- left_join(gps_data, hardship, by = "country")
head(gps_mixed_model)
# MIXED-MODELS #################### #################### ####################
# MIXED-MODELS WVS-DATA ####################
# Null model: random intercept per country only
model0 <- lmer(risktaking ~ 1 + (1 | country), data = WVS_mixed_model)
summary_model0 <- summary(model0)
# Model 1: age + gender fixed effects, with country-level random slopes
model1 <- lmer(risktaking ~ 1 + scale(z_score_age) + factor(gender) +
                 (1 + scale(z_score_age) + factor(gender) | country),
               data = WVS_mixed_model,
               control = lmerControl(optimizer = "bobyqa"))
summary_model1 <- summary(model1)
# Extract the fixed-effect rows in one matrix subset instead of
# repeating an element-by-element lookup for every cell
rows1 <- c("(Intercept)", "scale(z_score_age)", "factor(gender)1")
coefs1 <- summary_model1$coefficients[rows1, , drop = FALSE]
results_model1 <- data.frame(
  Predictor = c("Intercept", "Age", "Gender"),
  Estimate = coefs1[, "Estimate"],
  SE = coefs1[, "Std. Error"],
  T_score = coefs1[, "t value"],
  p_value = coefs1[, "Pr(>|t|)"],
  row.names = NULL
)
# Report p-values below .001 as "< .001"
results_model1$p_value <- ifelse(results_model1$p_value < 0.001, "< .001",
                                 sprintf("%.3f", results_model1$p_value))
# Display the results
print(results_model1)
# Model 2: add parental, marital, occupational and education status,
# each also as a country-level random slope
model2 <- lmer(risktaking ~ 1 + scale(z_score_age) + factor(gender) +
                 factor(children) + factor(married) + factor(employed) +
                 factor(education) +
                 (1 + scale(z_score_age) + factor(gender) + factor(children) +
                    factor(married) + factor(employed) + factor(education) | country),
               data = WVS_mixed_model,
               control = lmerControl(optCtrl = list(maxfun = 30000),
                                     optimizer = "bobyqa"))
# Model summary (the original called summary(model2) twice; once is enough)
summary_model2 <- summary(model2)
# Fixed-effect rows of interest; factor() contrasts carry the level suffix "1"
rows2 <- c("(Intercept)", "scale(z_score_age)", "factor(gender)1",
           "factor(children)1", "factor(married)1", "factor(employed)1",
           "factor(education)1")
coefs2 <- summary_model2$coefficients[rows2, , drop = FALSE]
results_model2 <- data.frame(
  Predictor = c("Intercept", "Age", "Gender", "Parental status",
                "Marital status", "Occupational status", "Education"),
  Estimate = coefs2[, "Estimate"],
  SE = coefs2[, "Std. Error"],
  T_score = coefs2[, "t value"],
  p_value = coefs2[, "Pr(>|t|)"],
  row.names = NULL
)
# Report p-values below .001 as "< .001"
results_model2$p_value <- ifelse(results_model2$p_value < 0.001, "< .001",
                                 sprintf("%.3f", results_model2$p_value))
# Display the results
print(results_model2)
# Model 3: cross-level interactions of country hardship with age and gender;
# fitted with ML (REML = FALSE) so the likelihood-ratio tests below are valid
model3 <- lmer(risktaking ~ 1+scale(z_score_age)*hardship+factor(gender)*hardship + factor(children) + factor(married) + factor(employed) + factor(education)+ (1+scale(z_score_age)+factor(gender)+ factor(children) + factor(married) + factor(employed) + factor(education)|country),data = WVS_mixed_model,control=lmerControl(optCtrl=list(maxfun=30000),optimizer="bobyqa"),REML = FALSE)
summary_model3=summary(model3)
# Nested model comparisons (anova() refits REML models with ML as needed)
anova(model0,model1)
anova(model1,model2)
anova(model2,model3)
# Stack the fixed-effect tables of models 1-3 into one matrix.
# The index c(1:2,4:8,3,9:10) reorders model3's rows so the main effects come
# before the hardship term and its interactions — NOTE(review): this assumes a
# fixed coefficient order in summary(model3)$coefficients; confirm against the
# fitted model before relying on the exported table.
coefsallmodels=rbind(summary_model1$coefficients,
summary_model2$coefficients,
summary_model3$coefficients[c(1:2,4:8,3,9:10),])
# Export the combined coefficient table to the current working directory
write.csv(coefsallmodels,"coefsallmodels.csv")
file_path <- file.path(getwd(), "coefsallmodels.csv")
file_path
# Extract the coefficient table for each model
coefficients_model0 <- summary(model0)$coefficients
coefficients_model1 <- summary(model1)$coefficients
coefficients_model2 <- summary(model2)$coefficients
coefficients_model3 <- summary(model3)$coefficients
# Keep only the fixed-effect rows of interest.
# BUG FIX: lmer labels factor contrasts with the level suffix (e.g.
# "factor(gender)1", "factor(gender)1:hardship"); the original filtered on
# "factor(gender)" etc., which never matched and silently dropped those rows.
# drop = FALSE keeps a single matched row as a matrix (with its rowname)
# instead of collapsing it to a bare vector.
keep1 <- c("(Intercept)", "scale(z_score_age)", "factor(gender)1")
keep2 <- c(keep1, "factor(children)1", "factor(married)1",
           "factor(employed)1", "factor(education)1")
keep3 <- c(keep2, "hardship", "scale(z_score_age):hardship",
           "factor(gender)1:hardship")
coefficients_model0 <- coefficients_model0[rownames(coefficients_model0) %in% "(Intercept)", , drop = FALSE]
coefficients_model1 <- coefficients_model1[rownames(coefficients_model1) %in% keep1, , drop = FALSE]
coefficients_model2 <- coefficients_model2[rownames(coefficients_model2) %in% keep2, , drop = FALSE]
coefficients_model3 <- coefficients_model3[rownames(coefficients_model3) %in% keep3, , drop = FALSE]
# Stack the estimated coefficients from all models
coefs_all_models <- rbind(coefficients_model0, coefficients_model1,
                          coefficients_model2, coefficients_model3)
# Build a results table from the stacked coefficients
results_table <- data.frame(
  Predictor = rownames(coefs_all_models),
  b = coefs_all_models[, "Estimate"],
  SE = coefs_all_models[, "Std. Error"],
  T_score = coefs_all_models[, "t value"],
  p_value = coefs_all_models[, "Pr(>|t|)"],
  row.names = NULL
)
# Print the results table
print(results_table)